1 Setup

suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(parallel)
  library(pander)
  library(plotly)
  library(RColorBrewer)
  library(tidyverse)
  library(tximport)
  library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv(here("doc/variables.csv"),
                      col_types=cols(
                        col_character(),
                        col_factor(),
                        col_factor(),
                        col_factor(),
                        col_factor()
                      ))

2 Raw data

tx2gene translation table

tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene.txt"),delim="\t",
                                       col_names=c("TXID","GENE")))

3 Raw data

filelist <- list.files(here("data/salmon"), 
                       recursive = TRUE, 
                       pattern = "quant.sf",
                       full.names = TRUE)

Sanity check to ensure that the data is sorted according to the sample info

names(filelist) <- sub("_S\\d+.*","",basename(dirname(filelist)))
stopifnot(all(samples$ID==names(filelist)))

Read the expression at the gene level

This gives us directly gene counts

counts <- suppressMessages(round(tximport(files = filelist, 
                                          type = "salmon",
                                          tx2gene=tx2gene)$counts))

# counts <- summarizeToGene(tx,tx2gene=tx2gene)

3.1 Quality Control

  • Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "17.8% percent (5766) of 32310 genes are not expressed"
  • Let us take a look at the sequencing depth, colouring by CHANGEME
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>% 
  bind_cols(samples)

ggplot(dat,aes(x,y,fill=TISSUE)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())

  • Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 5766 rows containing non-finite values (stat_density).

Also removing P14065_119

ggplot(data.frame(value=log10(rowMeans(counts[,colnames(counts)!="P14065_119"]))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 5769 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by CHANGEME.

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(TISSUE=samples$TISSUE[match(ind,samples$ID)]) %>% 
  mutate(TIME=samples$TIME[match(ind,samples$ID)]) %>% 
  mutate(TREATMENT=samples$TREATMENT[match(ind,samples$ID)])

ggplot(dat,aes(x=values,group=ind,col=TISSUE)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 399816 rows containing non-finite values (stat_density).

Removing P14065_119

dat <- as.data.frame(log10(counts[,colnames(counts)!="P14065_119"])) %>% utils::stack() %>% 
  mutate(TISSUE=samples$TISSUE[match(ind,samples$ID)]) %>% 
  mutate(TIME=samples$TIME[match(ind,samples$ID)]) %>% 
  mutate(TREATMENT=samples$TREATMENT[match(ind,samples$ID)])

ggplot(dat,aes(x=values,group=ind,col=TISSUE)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 373852 rows containing non-finite values (stat_density).

3.2 Export

dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write_csv(as.data.frame(counts) %>% rownames_to_column("ID"),
          here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))

4 Data normalisation

4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples,
  design = ~ TISSUE * CONDITION)
## converting counts to integer mode

4.2 Remove bad sample

dds <- dds[,-match("P14065_119",colnames(dds))]

save(dds,file=here("data/analysis/salmon/dds.rda"))

Check the size factors (i.e. the sequencing library size effect)

There is some variation -/+ 50%, but that’s acceptable

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P14065_101 P14065_102 P14065_103 P14065_104 P14065_105 P14065_106
0.8441 0.5639 0.7496 0.7239 1.015 0.8578
Table continues below
P14065_107 P14065_108 P14065_109 P14065_110 P14065_111 P14065_112
1.061 0.5505 0.7329 0.6929 0.4535 0.9757
Table continues below
P14065_113 P14065_114 P14065_115 P14065_116 P14065_117 P14065_118
0.9765 0.9171 0.6607 1.735 1.498 0.9708
Table continues below
P14065_120 P14065_121 P14065_122 P14065_123 P14065_124 P14065_125
1.14 1.818 0.9036 0.978 1.913 1.655
Table continues below
P14065_126 P14065_127 P14065_128 P14065_129 P14065_130 P17901_119
1.053 1.06 1.105 1.285 1.122 0.9718
P17901_120 P17901_121 P17901_122 P17901_123 P17901_124
0.9308 0.8531 1.435 1.723 1.436
boxplot(sizes, main="Sequencing libraries size factor")
abline(h=1,lty=2,col="grey")

4.3 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(vst)>0,])

4.4 QC on the normalised data

4.4.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=2

An the number of possible combinations

nlevel=nlevels(dds$TISSUE) * nlevels(dds$CONDITION)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)

4.4.2 2D

The most variance comes from the tissues. In the leaf, the time has more effect than the treatment, while this look the opposite in the apex.

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(dds)))

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=CONDITION,shape=TISSUE,text=ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

4.4.3 Heatmap

Filter for noise

conds <- factor(paste(dds$TISSUE,dds$CONDITION))
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)

vst.cutoff <- 2
  • Heatmap of “all” genes
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
                distfun=pearson.dist,
                hclustfun=function(X){hclust(X,method="ward.D2")},
                labRow = NA,trace = "none",
                labCol = conds,
                col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=paste(dds$ID,conds),cex=0.6)

4.5 Conclusion

The main differences in the leaf samples are according to the days of sampling. The new T1 dexa samples cluster in T1 with a cluster of T1 havin both mock and Dexa. It would seem generally that the effect of Dexa vs. Mock in leaf at T1 is minimal.

For appices, the situation is somewhat different, the time and treatment seem to be both of importance. T0 samples merged with T1 dexa while T1 mock form its own cluster, possibly indicating a delay induced by the treatment. The newer samples form their own clusters, possibly indicating that a batch effect (most likely due to sampling rather than sequencing as the leaf sample do not show such an effect) has a stronger influence than the biological signal. At T3 there is a clear separation between mock and dexa.

With regards to the batch observed in the new appices samples, it is however probably marginal as it is not visible in the first 2 dimension of the PCA, that contribute 80% of the variance. So it will be within the whole dataset at most 3%. If anything, assuming the effect is due to the sampling, hence biological noise, it would only make the comparison more robust.

5 Session Info

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.58.0                  tximport_1.18.0            
##  [3] forcats_0.5.0               stringr_1.4.0              
##  [5] dplyr_1.0.3                 purrr_0.3.4                
##  [7] readr_1.4.0                 tidyr_1.1.2                
##  [9] tibble_3.0.5                tidyverse_1.3.0            
## [11] RColorBrewer_1.1-2          plotly_4.9.3               
## [13] pander_0.6.3                hyperSpec_0.99-20201127    
## [15] xml2_1.3.2                  ggplot2_3.3.3              
## [17] lattice_0.20-41             here_1.0.1                 
## [19] gplots_3.1.1                DESeq2_1.30.0              
## [21] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [23] MatrixGenerics_1.2.0        matrixStats_0.57.0         
## [25] GenomicRanges_1.42.0        GenomeInfoDb_1.26.2        
## [27] IRanges_2.24.1              S4Vectors_0.28.1           
## [29] BiocGenerics_0.36.0         data.table_1.13.6          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0       ellipsis_0.3.1         rprojroot_2.0.2       
##  [4] XVector_0.30.0         fs_1.5.0               rstudioapi_0.13       
##  [7] hexbin_1.28.2          farver_2.0.3           affyio_1.60.0         
## [10] bit64_4.0.5            AnnotationDbi_1.52.0   fansi_0.4.2           
## [13] lubridate_1.7.9.2      splines_4.0.3          geneplotter_1.68.0    
## [16] knitr_1.30             jsonlite_1.7.2         broom_0.7.3           
## [19] annotate_1.68.0        dbplyr_2.0.0           png_0.1-7             
## [22] BiocManager_1.30.10    compiler_4.0.3         httr_1.4.2            
## [25] backports_1.2.1        assertthat_0.2.1       Matrix_1.3-2          
## [28] lazyeval_0.2.2         limma_3.46.0           cli_2.2.0             
## [31] htmltools_0.5.1        tools_4.0.3            affy_1.68.0           
## [34] gtable_0.3.0           glue_1.4.2             GenomeInfoDbData_1.2.4
## [37] Rcpp_1.0.6             cellranger_1.1.0       vctrs_0.3.6           
## [40] preprocessCore_1.52.1  crosstalk_1.1.1        xfun_0.20             
## [43] testthat_3.0.1         rvest_0.3.6            lifecycle_0.2.0       
## [46] gtools_3.8.2           XML_3.99-0.5           zlibbioc_1.36.0       
## [49] scales_1.1.1           hms_1.0.0              yaml_2.2.1            
## [52] memoise_1.1.0          latticeExtra_0.6-29    stringi_1.5.3         
## [55] RSQLite_2.2.2          highr_0.8              genefilter_1.72.0     
## [58] caTools_1.18.1         BiocParallel_1.24.1    rlang_0.4.10          
## [61] pkgconfig_2.0.3        bitops_1.0-6           evaluate_0.14         
## [64] labeling_0.4.2         htmlwidgets_1.5.3      bit_4.0.4             
## [67] tidyselect_1.1.0       magrittr_2.0.1         R6_2.5.0              
## [70] generics_0.1.0         DelayedArray_0.16.0    DBI_1.1.1             
## [73] pillar_1.4.7           haven_2.3.1            withr_2.4.0           
## [76] survival_3.2-7         RCurl_1.98-1.2         modelr_0.1.8          
## [79] crayon_1.3.4           KernSmooth_2.23-18     rmarkdown_2.6         
## [82] jpeg_0.1-8.1           locfit_1.5-9.4         readxl_1.3.1          
## [85] blob_1.2.1             reprex_0.3.0           digest_0.6.27         
## [88] xtable_1.8-4           munsell_0.5.0          viridisLite_0.3.0